{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Find co-occurring words which are specific to each of the subsets we've defined for the SOMA tweets sample.\n",
    "\n",
    "##### Data\n",
    "The data are word counts from\n",
    "* all tweets\n",
    "* all tweets containing one of several keywords (per topic-defined subset)\n",
    "\n",
    "##### Preprocessing\n",
    "The word counts are normalized so that each word represents its frequency in the corpus to which it pertains.\n",
    "\n",
    "##### Analysis\n",
    "First variant:\n",
    "* compile a complete dictionary with all words from all tweets\n",
    "* for each subset, divide each word's frequency by its frequency in all tweets\n",
    "* multiply that by the count of that word in the subset (so more commonly occurring words are weighed higher)\n",
    "* list the top most frequent words from each subset\n",
    "\n",
    "The formula is\n",
    "$$\n",
    "Score_s^w = f_s^w f^w N_s^w\n",
    "$$\n",
    "\n",
    "where $f_{subset}^w$ is the relative frequency of word $w$ in subset $s$, $f^w$ is the frequency of word $w$ overall, and $N_s^w$ is the count of word $w$ in subset $s$.\n",
    "\n",
    "Second variant:\n",
    "?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import string\n",
    "import pandas as pd\n",
    "import simple_json as json\n",
    "from smappPy.unicode_csv import UnicodeReader"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Loading the data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The _all_ dataset (all wordcounts in all tweets, minus stopwords, urls) is big. It has 36M words in it.\n",
    "\n",
    "When loading it in, let's throw away\n",
    "* words that are used only once (count=1)\n",
    "* words where less than 50% of the characters are non-ascii (chinese, arabic, ..)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10000000\n",
      "20000000\n"
     ]
    }
   ],
   "source": [
    "DATA_FOLDER = '/data/akalin/jona/smapp/data/yougov-outputs/merged/'\n",
    "\n",
    "d = dict()\n",
    "thrown_away = 0\n",
    "with open(os.path.join(DATA_FOLDER,'all.csv'), 'rt') as infile:\n",
    "    reader = UnicodeReader(infile)\n",
    "    for i,(word, count_str) in enumerate(reader):\n",
    "        count = int(count_str)\n",
    "        if count > 1 and (sum([c in string.ascii_letters for c in word]) / float(len(word)) > .5):\n",
    "            d[word] = count\n",
    "        else:\n",
    "            thrown_away += 1\n",
    "        if not ((i+1) % 10000000):\n",
    "            print i+1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "all_words = d.keys()\n",
    "all_ser = pd.Series([d[word] for word in all_words], index=all_words)\n",
    "del d, all_words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/jronen/.virtualenvs/bio2710/lib/python2.7/site-packages/ipykernel/__main__.py:1: FutureWarning: sort is deprecated, use sort_values(inplace=True) for for INPLACE sorting\n",
      "  if __name__ == '__main__':\n"
     ]
    }
   ],
   "source": [
    "all_ser.sort(ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "n        91895535\n",
       "amp      87637987\n",
       "new      58693934\n",
       "via      44616201\n",
       "today    42254056\n",
       "dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_ser.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data = {\n",
    "    'nhs': pd.Series.from_csv(os.path.join(DATA_FOLDER, 'nhs.csv'), encoding='utf8'),\n",
    "    'isis': pd.Series.from_csv(os.path.join(DATA_FOLDER, 'isis.csv'), encoding='utf8'),\n",
    "    'ebola': pd.Series.from_csv(os.path.join(DATA_FOLDER, 'ebola.csv'), encoding='utf8'),\n",
    "    'economy': pd.Series.from_csv(os.path.join(DATA_FOLDER, 'economy.csv'), encoding='utf8'),\n",
    "    'greek_exit': pd.Series.from_csv(os.path.join(DATA_FOLDER, 'greek_exit.csv'), encoding='utf8'),\n",
    "    'immigration': pd.Series.from_csv(os.path.join(DATA_FOLDER, 'immigration.csv'), encoding='utf8'),\n",
    "    'unemployment': pd.Series.from_csv(os.path.join(DATA_FOLDER, 'unemployment.csv'), encoding='utf8'),\n",
    "    'ties_to_the_eu': pd.Series.from_csv(os.path.join(DATA_FOLDER, 'ties_to_the_eu.csv'), encoding='utf8'),\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Get rid of once-occurring words\n",
    "for subset in data:\n",
    "    data[subset] = data[subset][data[subset] > 1]\n",
    "\n",
    "# Normalize\n",
    "normalized_data = dict()\n",
    "for subset in data:\n",
    "    normalized_data[subset] = data[subset] / data[subset].sum()\n",
    "normalized_all_ser = all_ser / all_ser.sum()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## First variant"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/jronen/.virtualenvs/bio2710/lib/python2.7/site-packages/ipykernel/__main__.py:9: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n"
     ]
    }
   ],
   "source": [
    "tf_ranked = dict()\n",
    "for subset in normalized_data:\n",
    "    r = pd.DataFrame()\n",
    "    r['f_' + subset] = normalized_data[subset]\n",
    "    r['f_total'] = normalized_all_ser.loc[normalized_data[subset].index]\n",
    "    r['N_' + subset] = data[subset]\n",
    "    r['N_total'] = all_ser.loc[data[subset].index]\n",
    "    r['score'] = (normalized_data[subset] / normalized_all_ser.loc[normalized_data[subset].index]) * data[subset]\n",
    "    r.sort(columns='score', ascending=False, inplace=True)\n",
    "    tf_ranked[subset] = r"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "======= isis =======\n",
      "                  f_isis       f_total   N_isis  N_total         score\n",
      "isis            0.093531  2.435509e-04  2613910  2613910  1.003824e+09\n",
      "jihad           0.005785  1.506322e-05   161666   161666  6.208482e+07\n",
      "kobane          0.003339  8.694152e-06    93310    93310  3.583397e+07\n",
      "islam           0.007323  9.175941e-05   204664   984808  1.633421e+07\n",
      "iraq            0.008169  1.221879e-04   228311  1311382  1.526480e+07\n",
      "syria           0.008255  1.523099e-04   230709  1634666  1.250451e+07\n",
      "fundamentalist  0.000668  1.740321e-06    18678    18678  7.172939e+06\n",
      "iraqi           0.002382  2.496790e-05    66567   267968  6.350410e+06\n",
      "mosul           0.001401  8.688841e-06    39145    93253  6.310395e+06\n",
      "kurds           0.001550  1.160764e-05    43320   124579  5.784939e+06\n",
      "kurdish         0.001601  1.290901e-05    44754   138546  5.551834e+06\n",
      "quran           0.001486  1.241109e-05    41535   133202  4.973755e+06\n",
      "ypg             0.000994  5.671930e-06    27769    60874  4.864692e+06\n",
      "raqqa           0.000787  3.673887e-06    22001    39430  4.714382e+06\n",
      "palmyra         0.000970  6.457488e-06    27110    69305  4.072501e+06\n",
      "islamic         0.003417  8.753756e-05    95496   939497  3.727707e+06\n",
      "twitterkurds    0.000814  5.066107e-06    22746    54372  3.654277e+06\n",
      "fighters        0.002211  3.758369e-05    61786   403367  3.634516e+06\n",
      "ramadi          0.000885  6.142930e-06    24731    65929  3.562654e+06\n",
      "muslim          0.003584  1.124677e-04   100150  1207060  3.191095e+06\n",
      "kobani          0.000737  4.985138e-06    20610    53503  3.048910e+06\n",
      "beheading       0.001010  9.828743e-06    28234   105487  2.902102e+06\n",
      "bb4sp           0.002226  5.290332e-05    62201   567785  2.616842e+06\n",
      "beheadings      0.000581  3.739576e-06    16243    40135  2.524503e+06\n",
      "peshmerga       0.000547  3.497414e-06    15277    37536  2.387783e+06\n",
      "militants       0.001289  1.982343e-05    36026   212755  2.342713e+06\n",
      "baghdadi        0.000469  2.629676e-06    13111    28223  2.339027e+06\n",
      "tikrit          0.000760  7.104215e-06    21238    76246  2.271836e+06\n",
      "jordanian       0.000611  4.785930e-06    17084    51365  2.182120e+06\n",
      "beheads         0.000442  2.543396e-06    12358    27297  2.148564e+06\n",
      "daesh           0.000659  5.677894e-06    18429    60938  2.140337e+06\n",
      "caliphate       0.000566  4.178523e-06    15806    44846  2.139375e+06\n",
      "yazidi          0.000487  3.124435e-06    13624    33533  2.125708e+06\n",
      "airstrikes      0.000928  1.158705e-05    25934   124358  2.076979e+06\n",
      "assad           0.001372  2.538197e-05    38348   272412  2.073127e+06\n",
      "assyrian        0.000405  2.211600e-06    11309    23736  2.069225e+06\n",
      "syrian          0.001795  4.441509e-05    50163   476685  2.027226e+06\n",
      "islamicstate    0.000688  6.686233e-06    19233    71760  1.979604e+06\n",
      "beheaded        0.000690  7.063125e-06    19279    75805  1.882946e+06\n",
      "libya           0.001623  4.031912e-05    45345   432725  1.824793e+06\n",
      "isil            0.001139  2.031903e-05    31839   218074  1.785179e+06\n",
      "christians      0.001632  4.220666e-05    45604   452983  1.763156e+06\n",
      "tcot            0.003158  1.645065e-04    88255  1765566  1.694187e+06\n",
      "muhammad        0.001105  2.034717e-05    30889   218376  1.677913e+06\n",
      "obama           0.004795  4.014422e-04   134018  4308479  1.600918e+06\n",
      "qaeda           0.000972  1.777265e-05    27178   190745  1.487129e+06\n",
      "islamorealism   0.000151  4.322385e-07     4214     4639  1.470048e+06\n",
      "jihadi          0.000988  1.891386e-05    27609   202993  1.442072e+06\n",
      "muslims         0.002012  7.846307e-05    56224   842105  1.441597e+06\n",
      "mutilator       0.000152  4.644770e-07     4238     4985  1.383641e+06\n",
      "-----------\n",
      "======= unemployment =======\n",
      "                 f_unemployment       f_total  N_unemployment  N_total  \\\n",
      "unemployment           0.094512  3.154251e-05          338530   338530   \n",
      "rate                   0.015489  1.181759e-04           55478  1268323   \n",
      "muthafukka             0.000377  1.390170e-07            1349     1492   \n",
      "youth                  0.011270  1.266536e-04           40369  1359310   \n",
      "zerohours              0.001622  2.918425e-06            5811    31322   \n",
      "nsubsides              0.000207  6.969484e-08             740      748   \n",
      "welfarereform          0.000929  1.527603e-06            3326    16395   \n",
      "lowest                 0.003552  2.885674e-05           12724   309705   \n",
      "figures                0.005500  7.235611e-05           19700   776562   \n",
      "toryscum               0.000702  1.690007e-06            2515    18138   \n",
      "falls                  0.004444  6.922152e-05           15918   742920   \n",
      "jobless                0.001417  7.348240e-06            5075    78865   \n",
      "employment             0.003144  4.477530e-05           11262   480551   \n",
      "wages                  0.002751  3.657461e-05            9855   392537   \n",
      "underemployment        0.000316  4.923363e-07            1132     5284   \n",
      "jobsreport             0.000257  3.867691e-07             922     4151   \n",
      "jobs                   0.008931  5.257775e-04           31988  5642908   \n",
      "nspain                 0.000331  7.207080e-07            1184     7735   \n",
      "psychocrats            0.000096  6.186815e-08             344      664   \n",
      "massaging              0.000319  7.400884e-07            1144     7943   \n",
      "longtermplan           0.000387  1.123410e-06            1387    12057   \n",
      "ngreece                0.000314  8.093174e-07            1126     8686   \n",
      "satire                 0.001632  2.222669e-05            5845   238548   \n",
      "wca                    0.000712  4.369811e-06            2551    46899   \n",
      "unemployed             0.001431  1.879254e-05            5125   201691   \n",
      "nitaly                 0.000289  8.081061e-07            1035     8673   \n",
      "inflation              0.001602  2.592611e-05            5737   278252   \n",
      "hcxvwx1n               0.000031  1.043559e-08             112      112   \n",
      "bedroomtax             0.001058  1.230962e-05            3791   132113   \n",
      "blsdata                0.000116  1.515024e-07             416     1626   \n",
      "nfrance                0.000336  1.365199e-06            1203    14652   \n",
      "economy                0.003501  1.580291e-04           12540  1696048   \n",
      "claimant               0.000503  3.369671e-06            1802    36165   \n",
      "nportugal              0.000174  4.066154e-07             623     4364   \n",
      "technological          0.000541  4.258839e-06            1937    45708   \n",
      "low                    0.003782  2.180999e-04           13548  2340757   \n",
      "rates                  0.002027  6.388036e-05            7260   685596   \n",
      "fell                   0.002084  6.826899e-05            7464   732697   \n",
      "benefit                0.002809  1.242713e-04           10063  1333742   \n",
      "growth                 0.002983  1.435531e-04           10686  1540684   \n",
      "furtherfalls           0.000021  6.894944e-09              74       74   \n",
      "dla                    0.000564  5.231120e-06            2020    56143   \n",
      "fallen                 0.001632  4.465334e-05            5845   479242   \n",
      "percent                0.001508  3.811404e-05            5400   409059   \n",
      "usleadership           0.000034  1.891451e-08             120      203   \n",
      "poverty                0.002508  1.075682e-04            8983  1154476   \n",
      "jsa                    0.000455  3.631120e-06            1629    38971   \n",
      "recession              0.000861  1.300703e-05            3083   139598   \n",
      "pips                   0.000705  8.756766e-06            2524    93982   \n",
      "higher                 0.002291  9.367500e-05            8207  1005367   \n",
      "\n",
      "                        score  \n",
      "unemployment     1.014352e+09  \n",
      "rate             7.271152e+06  \n",
      "muthafukka       3.654657e+06  \n",
      "youth            3.592270e+06  \n",
      "zerohours        3.230307e+06  \n",
      "nsubsides        2.193579e+06  \n",
      "welfarereform    2.021737e+06  \n",
      "lowest           1.566356e+06  \n",
      "figures          1.497434e+06  \n",
      "toryscum         1.044908e+06  \n",
      "falls            1.021942e+06  \n",
      "jobless          9.785416e+05  \n",
      "employment       7.908298e+05  \n",
      "wages            7.413515e+05  \n",
      "underemployment  7.266437e+05  \n",
      "jobsreport       6.136215e+05  \n",
      "jobs             5.433288e+05  \n",
      "nspain           5.430434e+05  \n",
      "psychocrats      5.339988e+05  \n",
      "massaging        4.936953e+05  \n",
      "longtermplan     4.780850e+05  \n",
      "ngreece          4.373695e+05  \n",
      "satire           4.291259e+05  \n",
      "wca              4.157659e+05  \n",
      "unemployed       3.902049e+05  \n",
      "nitaly           3.700862e+05  \n",
      "inflation        3.544238e+05  \n",
      "hcxvwx1n         3.355904e+05  \n",
      "bedroomtax       3.259519e+05  \n",
      "blsdata          3.189023e+05  \n",
      "nfrance          2.959551e+05  \n",
      "economy          2.778104e+05  \n",
      "claimant         2.690374e+05  \n",
      "nportugal        2.664912e+05  \n",
      "technological    2.459566e+05  \n",
      "low              2.349555e+05  \n",
      "rates            2.303544e+05  \n",
      "fell             2.278297e+05  \n",
      "benefit          2.274964e+05  \n",
      "growth           2.220794e+05  \n",
      "furtherfalls     2.217294e+05  \n",
      "dla              2.177703e+05  \n",
      "fallen           2.136022e+05  \n",
      "percent          2.135960e+05  \n",
      "usleadership     2.125485e+05  \n",
      "poverty          2.094351e+05  \n",
      "jsa              2.040291e+05  \n",
      "recession        2.040137e+05  \n",
      "pips             2.031073e+05  \n",
      "higher           2.007408e+05  \n",
      "-----------\n",
      "======= greek_exit =======\n",
      "              f_greek_exit       f_total  N_greek_exit  N_total         score\n",
      "greece            0.081793  1.800492e-04       1932378  1932378  8.778431e+08\n",
      "syriza            0.011152  2.454842e-05        263466   263466  1.196877e+08\n",
      "tsipras           0.008996  1.980377e-05        212544   212544  9.655475e+07\n",
      "eurozone          0.008811  1.939585e-05        208166   208166  9.456591e+07\n",
      "grexit            0.006707  1.476468e-05        158462   158462  7.198631e+07\n",
      "greek             0.008932  9.965151e-05        211013  1069510  1.891291e+07\n",
      "imf               0.004438  2.588753e-05        104846   277838  1.797367e+07\n",
      "bailout           0.003433  1.612336e-05         81102   173044  1.726758e+07\n",
      "creditors         0.002337  8.093174e-06         55213    86860  1.594365e+07\n",
      "eurogroup         0.002195  8.421709e-06         51855    90386  1.351465e+07\n",
      "euro              0.005413  7.153766e-05        127876   767778  9.675362e+06\n",
      "athens            0.002557  1.737433e-05         60421   186470  8.893888e+06\n",
      "greferendum       0.001425  5.901234e-06         33663    63335  8.128055e+06\n",
      "debt              0.005893  1.051204e-04        139213  1128205  7.803624e+06\n",
      "troika            0.001248  5.059026e-06         29494    54296  7.278205e+06\n",
      "varoufakis        0.001711  9.721126e-06         40434   104332  7.118700e+06\n",
      "oxi               0.001242  5.977078e-06         29345    64149  6.098221e+06\n",
      "ecb               0.002454  2.532336e-05         57971   271783  5.617254e+06\n",
      "merkel            0.002000  2.348968e-05         47259   252103  4.024536e+06\n",
      "default           0.001675  1.744356e-05         39566   187213  3.798683e+06\n",
      "austerity         0.004373  1.229664e-04        103306  1319737  3.673571e+06\n",
      "greeks            0.001426  1.332504e-05         33679   143011  3.603078e+06\n",
      "eu                0.007125  3.551710e-04        168341  3811873  3.377273e+06\n",
      "ekloges2015       0.000409  1.264756e-06          9661    13574  3.123641e+06\n",
      "greececrisis      0.000533  2.579734e-06         12602    27687  2.605721e+06\n",
      "dijsselbloem      0.000463  2.082180e-06         10927    22347  2.427211e+06\n",
      "finmin            0.000389  1.474400e-06          9184    15824  2.421432e+06\n",
      "santorini         0.000550  3.052876e-06         12994    32765  2.340994e+06\n",
      "referendum        0.002824  8.649680e-05         66715   928327  2.178065e+06\n",
      "germany           0.003307  1.234154e-04         78122  1324556  2.093154e+06\n",
      "europe            0.004297  2.115910e-04        101514  2270901  2.061477e+06\n",
      "drachma           0.000313  1.157326e-06          7384    12421  1.994121e+06\n",
      "alexis            0.001383  2.437083e-05         32663   261560  1.852958e+06\n",
      "crisis            0.003627  1.714793e-04         85677  1840402  1.811927e+06\n",
      "samaras           0.000422  2.494759e-06          9971    26775  1.686836e+06\n",
      "schaeuble         0.000293  1.256184e-06          6913    13482  1.610288e+06\n",
      "vouli             0.000231  8.025156e-07          5460     8613  1.572373e+06\n",
      "potami            0.000171  4.512462e-07          4038     4843  1.529477e+06\n",
      "repayment         0.000364  2.134824e-06          8600    22912  1.466421e+06\n",
      "juncker           0.000749  9.544839e-06         17692   102440  1.388063e+06\n",
      "syntagma          0.000273  1.305940e-06          6450    14016  1.348404e+06\n",
      "uble              0.000307  1.690473e-06          7264    18143  1.321196e+06\n",
      "dimopsifisma      0.000199  7.234101e-07          4695     7764  1.289765e+06\n",
      "lapavitsas        0.000138  3.665502e-07          3272     3934  1.236281e+06\n",
      "pasok             0.000244  1.164873e-06          5765    12502  1.207658e+06\n",
      "defaults          0.000287  1.645190e-06          6781    17657  1.183029e+06\n",
      "rtrs              0.000275  1.655625e-06          6502    17769  1.080826e+06\n",
      "yanis             0.000325  2.396180e-06          7686    25717  1.043531e+06\n",
      "lagarde           0.000368  3.097787e-06          8699    33247  1.033977e+06\n",
      "banks             0.002018  9.499118e-05         47670  1019493  1.012584e+06\n",
      "-----------\n",
      "======= immigration =======\n",
      "                   f_immigration       f_total  N_immigration   N_total  \\\n",
      "immigration             0.102015  1.020649e-04        1095412   1095412   \n",
      "detention               0.002706  2.108157e-05          29054    226258   \n",
      "uncontrolled            0.000697  1.591148e-06           7487     17077   \n",
      "ukip                    0.009590  4.099395e-04         102971   4399676   \n",
      "obama                   0.007070  4.014422e-04          75921   4308479   \n",
      "farage                  0.004272  1.565854e-04          45869   1680553   \n",
      "policy                  0.004638  1.924743e-04          49804   2065731   \n",
      "controls                0.001532  2.348194e-05          16454    252020   \n",
      "reform                  0.002636  7.495848e-05          28307    804492   \n",
      "leadersdebate           0.002953  1.074855e-04          31704   1153588   \n",
      "immigrants              0.001780  4.039916e-05          19113    433584   \n",
      "illegal                 0.002482  8.091618e-05          26652    868433   \n",
      "eu                      0.005112  3.551710e-04          54887   3811873   \n",
      "labour                  0.008035  8.782921e-04          86277   9426271   \n",
      "yarl                    0.000383  2.559329e-06           4113     27468   \n",
      "mug                     0.001266  3.070235e-05          13595    329513   \n",
      "bbcqt                   0.002955  1.766268e-04          31728   1895647   \n",
      "mass                    0.002071  8.828613e-05          22239    947531   \n",
      "bordersecurity          0.000093  1.802935e-07            999      1935   \n",
      "nigel                   0.002060  1.033931e-04          22117   1109667   \n",
      "ncustoms                0.000073  1.318425e-07            784      1415   \n",
      "time4atimelimit         0.000296  2.187188e-06           3181     23474   \n",
      "noamnesty               0.000203  1.128255e-06           2175     12109   \n",
      "debate                  0.002978  2.496712e-04          31978   2679596   \n",
      "immigrant               0.000792  1.766522e-05           8504    189592   \n",
      "migrants                0.001272  4.594586e-05          13661    493114   \n",
      "control                 0.002628  2.005737e-04          28222   2152658   \n",
      "anti                    0.003025  2.718705e-04          32479   2917851   \n",
      "cameron                 0.003751  4.251580e-04          40276   4563009   \n",
      "gop                     0.001724  9.196831e-05          18514    987050   \n",
      "immigrationaction       0.000180  1.020452e-06           1936     10952   \n",
      "uk                      0.005945  1.194076e-03          63838  12815427   \n",
      "asylum                  0.000892  2.718164e-05           9583    291727   \n",
      "rhetoric                0.000664  1.551940e-05           7134    166562   \n",
      "migration               0.000854  2.572569e-05           9166    276101   \n",
      "bbcdebate               0.000961  3.553114e-05          10318    381338   \n",
      "britain                 0.002613  2.709010e-04          28063   2907445   \n",
      "righttorent             0.000055  1.269974e-07            590      1363   \n",
      "dhs                     0.000409  7.078965e-06           4387     75975   \n",
      "tntweeters              0.000290  3.595527e-06           3119     38589   \n",
      "borders                 0.000726  2.389368e-05           7793    256439   \n",
      "policies                0.001216  6.748091e-05          13054    724239   \n",
      "coulter                 0.000264  3.281435e-06           2839     35218   \n",
      "system                  0.002167  2.275130e-04          23269   2441784   \n",
      "immigrationreform       0.000111  6.196133e-07           1187      6650   \n",
      "unskilled               0.000139  9.798275e-07           1490     10516   \n",
      "wages                   0.000845  3.657461e-05           9069    392537   \n",
      "border                  0.001036  5.709517e-05          11128    612774   \n",
      "criminally              0.000192  1.962823e-06           2060     21066   \n",
      "nhs                     0.002688  3.874176e-04          28858   4157960   \n",
      "\n",
      "                          score  \n",
      "immigration        1.094876e+09  \n",
      "detention          3.729026e+06  \n",
      "uncontrolled       3.280889e+06  \n",
      "ukip               2.408777e+06  \n",
      "obama              1.337172e+06  \n",
      "farage             1.251336e+06  \n",
      "policy             1.200168e+06  \n",
      "controls           1.073731e+06  \n",
      "reform             9.955279e+05  \n",
      "leadersdebate      8.708931e+05  \n",
      "immigrants         8.421160e+05  \n",
      "illegal            8.175433e+05  \n",
      "eu                 7.899289e+05  \n",
      "labour             7.892918e+05  \n",
      "yarl               6.155706e+05  \n",
      "mug                5.606261e+05  \n",
      "bbcqt              5.307810e+05  \n",
      "mass               5.217044e+05  \n",
      "bordersecurity     5.155104e+05  \n",
      "nigel              4.406028e+05  \n",
      "ncustoms           4.341733e+05  \n",
      "time4atimelimit    4.308516e+05  \n",
      "noamnesty          3.904790e+05  \n",
      "debate             3.814352e+05  \n",
      "immigrant          3.812536e+05  \n",
      "migrants           3.782728e+05  \n",
      "control            3.698180e+05  \n",
      "anti               3.613513e+05  \n",
      "cameron            3.553275e+05  \n",
      "gop                3.470954e+05  \n",
      "immigrationaction  3.420619e+05  \n",
      "uk                 3.178432e+05  \n",
      "asylum             3.146399e+05  \n",
      "rhetoric           3.054061e+05  \n",
      "migration          3.041440e+05  \n",
      "bbcdebate          2.790413e+05  \n",
      "britain            2.707348e+05  \n",
      "righttorent        2.552676e+05  \n",
      "dhs                2.531932e+05  \n",
      "tntweeters         2.519734e+05  \n",
      "borders            2.367079e+05  \n",
      "policies           2.351759e+05  \n",
      "coulter            2.287460e+05  \n",
      "system             2.216336e+05  \n",
      "immigrationreform  2.117714e+05  \n",
      "unskilled          2.110131e+05  \n",
      "wages              2.094236e+05  \n",
      "border             2.019860e+05  \n",
      "criminally         2.013445e+05  \n",
      "nhs                2.001887e+05  \n",
      "-----------\n",
      "======= ties_to_the_eu =======\n",
      "                   f_ties_to_the_eu       f_total  N_ties_to_the_eu   N_total  \\\n",
      "brexit                     0.098136  1.628800e-05            174811    174811   \n",
      "no2eu                      0.009398  3.540740e-06             16740     38001   \n",
      "betteroffout               0.003406  1.140927e-06              6067     12245   \n",
      "eureferendum               0.003486  3.214162e-06              6209     34496   \n",
      "eu                         0.036564  3.551710e-04             65132   3811873   \n",
      "euref                      0.004698  6.596878e-06              8369     70801   \n",
      "grexit                     0.004261  1.476468e-05              7590    158462   \n",
      "scoxit                     0.000167  3.214535e-08               298       345   \n",
      "stayineu                   0.000159  3.447472e-08               284       370   \n",
      "flexcit                    0.000214  6.307942e-08               381       677   \n",
      "referendum                 0.007313  8.649680e-05             13027    928327   \n",
      "ciuriak                    0.000108  1.910086e-08               192       205   \n",
      "yestoeu                    0.000197  7.649661e-08               351       821   \n",
      "ivotedukip                 0.000175  6.326577e-08               311       679   \n",
      "nothankeu                  0.000143  4.723969e-08               255       507   \n",
      "noxi                       0.000155  5.692988e-08               276       611   \n",
      "spexit                     0.000107  3.074773e-08               190       330   \n",
      "nunelected                 0.000145  5.860703e-08               258       629   \n",
      "efta                       0.000335  3.436291e-07               596      3688   \n",
      "frexit                     0.000163  8.162123e-08               290       876   \n",
      "uk                         0.019593  1.194076e-03             34902  12815427   \n",
      "scaremongers               0.000284  2.783135e-07               506      2987   \n",
      "anually                    0.000089  2.730025e-08               158       293   \n",
      "irexit                     0.000067  1.639879e-08               120       176   \n",
      "britty                     0.000083  2.590263e-08               148       278   \n",
      "eureform                   0.000449  7.752154e-07               800      8320   \n",
      "britain                    0.008334  2.709010e-04             14845   2907445   \n",
      "britinfluence              0.000050  9.969717e-09                89       107   \n",
      "britainisonhold            0.000039  6.522245e-09                70        70   \n",
      "jcb                        0.000674  2.007733e-06              1200     21548   \n",
      "brexitfollowtrain          0.000043  8.385743e-09                77        90   \n",
      "renegotiation              0.000819  3.019706e-06              1459     32409   \n",
      "europhile                  0.000324  4.875844e-07               578      5233   \n",
      "ukineurope                 0.000157  1.141393e-07               279      1225   \n",
      "eurosceptics               0.000587  1.661309e-06              1045     17830   \n",
      "ukip                       0.009038  4.099395e-04             16100   4399676   \n",
      "scexit                     0.000060  1.816911e-08               107       195   \n",
      "taskforce                  0.000696  2.463638e-06              1240     26441   \n",
      "iiffrankfurt               0.000053  1.481481e-08                95       159   \n",
      "notoeu                     0.000089  4.491031e-08               159       482   \n",
      "n02eu                      0.000030  5.217796e-09                54        56   \n",
      "brits4greece               0.000029  4.845096e-09                52        52   \n",
      "europe                     0.005957  2.115910e-04             10611   2270901   \n",
      "betonbrexit                0.000084  4.230142e-08               150       454   \n",
      "ruparel                    0.000115  8.162123e-08               205       876   \n",
      "yes2eu                     0.000132  1.131144e-07               235      1214   \n",
      "unexpelled                 0.000025  4.099697e-09                44        44   \n",
      "allarad                    0.000024  4.006522e-09                43        43   \n",
      "eurosceptic                0.000547  2.129140e-06               975     22851   \n",
      "canwebrexit                0.000023  3.820172e-09                41        41   \n",
      "\n",
      "                          score  \n",
      "brexit             1.053246e+09  \n",
      "no2eu              4.443008e+07  \n",
      "betteroffout       1.811133e+07  \n",
      "eureferendum       6.733421e+06  \n",
      "eu                 6.705198e+06  \n",
      "euref              5.960311e+06  \n",
      "grexit             2.190381e+06  \n",
      "scoxit             1.550867e+06  \n",
      "stayineu           1.313397e+06  \n",
      "flexcit            1.291881e+06  \n",
      "referendum         1.101410e+06  \n",
      "ciuriak            1.083452e+06  \n",
      "yestoeu            9.041329e+05  \n",
      "ivotedukip         8.582468e+05  \n",
      "nothankeu          7.727404e+05  \n",
      "noxi               7.511699e+05  \n",
      "spexit             6.591048e+05  \n",
      "nunelected         6.376024e+05  \n",
      "efta               5.803137e+05  \n",
      "frexit             5.784330e+05  \n",
      "uk                 5.727021e+05  \n",
      "scaremongers       5.164485e+05  \n",
      "anually            5.133431e+05  \n",
      "irexit             4.929593e+05  \n",
      "britty             4.747225e+05  \n",
      "eureform           4.634660e+05  \n",
      "britain            4.566780e+05  \n",
      "britinfluence      4.460232e+05  \n",
      "britainisonhold    4.217540e+05  \n",
      "jcb                4.026398e+05  \n",
      "brexitfollowtrain  3.969174e+05  \n",
      "renegotiation      3.957365e+05  \n",
      "europhile          3.846504e+05  \n",
      "ukineurope         3.828543e+05  \n",
      "eurosceptics       3.690137e+05  \n",
      "ukip               3.549705e+05  \n",
      "scexit             3.537481e+05  \n",
      "taskforce          3.503698e+05  \n",
      "iiffrankfurt       3.419883e+05  \n",
      "notoeu             3.160155e+05  \n",
      "n02eu              3.137334e+05  \n",
      "brits4greece       3.133030e+05  \n",
      "europe             2.987278e+05  \n",
      "betonbrexit        2.985987e+05  \n",
      "ruparel            2.890446e+05  \n",
      "yes2eu             2.740806e+05  \n",
      "unexpelled         2.651025e+05  \n",
      "allarad            2.590775e+05  \n",
      "eurosceptic        2.506486e+05  \n",
      "canwebrexit        2.470274e+05  \n",
      "-----------\n",
      "======= nhs =======\n",
      "                    f_nhs       f_total    N_nhs   N_total         score\n",
      "nhs              0.099105  3.874176e-04  4157960   4157960  1.063649e+09\n",
      "privatisation    0.003916  2.851013e-05   164293    305985  2.256607e+07\n",
      "8bn              0.001129  8.509666e-06    47365     91330  6.283761e+06\n",
      "privatising      0.000719  4.122338e-06    30168     44243  5.262190e+06\n",
      "reorganisation   0.000582  3.235127e-06    24418     34721  4.392846e+06\n",
      "tories           0.005821  3.422663e-04   244230   3673373  4.153852e+06\n",
      "ttip             0.002005  4.302147e-05    84135    461728  3.921798e+06\n",
      "nurses           0.001974  4.244593e-05    82839    455551  3.853459e+06\n",
      "patients         0.002700  8.022846e-05   113282    861052  3.812505e+06\n",
      "privatised       0.000768  6.841183e-06    32218     73423  3.616455e+06\n",
      "health           0.006482  4.932030e-04   271963   5293302  3.574465e+06\n",
      "private          0.003611  1.571562e-04   151502   1686679  3.481147e+06\n",
      "saveournhs       0.000620  5.037502e-06    26033     54065  3.206640e+06\n",
      "privatise        0.000599  4.753692e-06    25118     51019  3.163415e+06\n",
      "cameron          0.005478  4.251580e-04   229829   4563009  2.961259e+06\n",
      "labour           0.007682  8.782921e-04   322298   9426271  2.818989e+06\n",
      "staff            0.003884  2.334966e-04   162937   2506003  2.710041e+06\n",
      "hospitals        0.001372  3.051311e-05    57574    327482  2.589306e+06\n",
      "pfi              0.000596  6.185417e-06    24991     66385  2.406663e+06\n",
      "publicduty       0.000268  1.299790e-06    11263     13950  2.326226e+06\n",
      "dismantling      0.000480  4.634334e-06    20142     49738  2.086579e+06\n",
      "care             0.004783  4.736823e-04   200688   5083796  2.026622e+06\n",
      "tory             0.004158  3.582916e-04   174457   3845365  2.024684e+06\n",
      "doctors          0.001695  6.022259e-05    71103    646339  2.000937e+06\n",
      "healthcare       0.001591  5.405571e-05    66755    580153  1.964912e+06\n",
      "bringbackthenhs  0.000327  2.510039e-06    13702     26939  1.782808e+06\n",
      "crisis           0.002645  1.714793e-04   110957   1840402  1.711255e+06\n",
      "hospital         0.002589  1.703145e-04   108639   1827901  1.651722e+06\n",
      "weaponise        0.000229  1.462846e-06     9588     15700  1.497871e+06\n",
      "cuts             0.002716  2.073766e-04   113955   2225670  1.492533e+06\n",
      "hunt             0.001902  1.114870e-04    79785   1196534  1.360929e+06\n",
      "nomoregames      0.000195  1.288050e-06     8183     13824  1.239109e+06\n",
      "funding          0.002015  1.410774e-04    84557   1514113  1.207978e+06\n",
      "gp               0.001218  5.204789e-05    51115    558604  1.196495e+06\n",
      "lansley          0.000265  2.491684e-06    11116     26742  1.182010e+06\n",
      "gps              0.000879  2.793133e-05    36861    299773  1.159471e+06\n",
      "nhscrisis        0.000253  2.392173e-06    10612     25674  1.122067e+06\n",
      "trusts           0.000502  9.714045e-06    21078    104256  1.090124e+06\n",
      "notcameronsnhs   0.000165  1.042814e-06     6902     11192  1.088830e+06\n",
      "hinchingbrooke   0.000344  4.610295e-06    14414     49480  1.074131e+06\n",
      "burnham          0.000814  2.685711e-05    34145    288244  1.034694e+06\n",
      "services         0.002595  2.749992e-04   108874   2951429  1.027387e+06\n",
      "leadersdebate    0.001581  1.074855e-04    66346   1153588  9.761055e+05\n",
      "nyes             0.000591  1.525795e-05    24804    163756  9.610904e+05\n",
      "patient          0.001238  6.786228e-05    51944    728332  9.476750e+05\n",
      "780million       0.000094  3.950617e-07     3924      4240  9.289881e+05\n",
      "save             0.002919  3.904669e-04   122463   4190686  9.154670e+05\n",
      "spending         0.001638  1.293465e-04    68736   1388211  8.706248e+05\n",
      "bbcqt            0.001876  1.766268e-04    78688   1895647  8.355599e+05\n",
      "amp              0.012442  8.165663e-03   522024  87637987  7.954384e+05\n",
      "-----------\n",
      "======= ebola =======\n",
      "                f_ebola       f_total  N_ebola  N_total         score\n",
      "ebola          0.109503  1.155168e-04  1239784  1239784  1.175244e+09\n",
      "liberia        0.007021  1.071735e-05    79487   115024  5.206978e+07\n",
      "leone          0.006563  1.093091e-05    74305   117316  4.461294e+07\n",
      "sierra         0.006590  1.511484e-05    74609   162220  3.252820e+07\n",
      "outbreak       0.005927  1.783685e-05    67101   191434  2.229569e+07\n",
      "virus          0.005270  2.121323e-05    59662   227671  1.482074e+07\n",
      "sierraleone    0.001757  2.703750e-06    19897    29018  1.293272e+07\n",
      "cafferkey      0.001246  1.679385e-06    14109    18024  1.046944e+07\n",
      "quarantine     0.001381  2.948521e-06    15636    31645  7.323661e+06\n",
      "nurse          0.005063  4.935029e-05    57318   529652  5.879948e+06\n",
      "ebolaresponse  0.001108  2.684649e-06    12542    28813  5.175200e+06\n",
      "cdc            0.001797  7.123689e-06    20349    76455  5.134076e+06\n",
      "guinea         0.002228  1.267831e-05    25229   136070  4.434245e+06\n",
      "liberian       0.000700  1.360633e-06     7920    14603  4.071836e+06\n",
      "patient        0.004786  6.786228e-05    54181   728332  3.820733e+06\n",
      "africa         0.006862  1.415726e-04    77694  1519428  3.765970e+06\n",
      "infected       0.001482  8.213463e-06    16775    88151  3.026076e+06\n",
      "pauline        0.001317  6.754064e-06    14912    72488  2.907955e+06\n",
      "vaccine        0.001994  1.795015e-05    22576   192650  2.507881e+06\n",
      "epidemic       0.001630  1.209569e-05    18459   129817  2.488094e+06\n",
      "quarantined    0.000495  1.161053e-06     5600    12461  2.385641e+06\n",
      "quarantines    0.000273  4.138830e-07     3094     4442  2.042886e+06\n",
      "czar           0.000600  2.063825e-06     6790    22150  1.973094e+06\n",
      "cases          0.003207  5.964295e-05    36314   640118  1.952856e+06\n",
      "freetown       0.000322  6.077800e-07     3650     6523  1.936067e+06\n",
      "contracted     0.000707  3.035639e-06     8007    32580  1.865394e+06\n",
      "monrovia       0.000283  5.632424e-07     3200     6045  1.605779e+06\n",
      "klain          0.000182  2.499883e-07     2055     2683  1.492056e+06\n",
      "zmapp          0.000164  2.058234e-07     1862     2209  1.487804e+06\n",
      "stopthespread  0.000198  3.194036e-07     2237     3428  1.383800e+06\n",
      "ebolaoutbreak  0.000290  6.892149e-07     3280     7397  1.378714e+06\n",
      "liberians      0.000165  2.290240e-07     1869     2458  1.347159e+06\n",
      "pooley         0.000333  1.002562e-06     3772    10760  1.253468e+06\n",
      "timepoy        0.000183  3.281621e-07     2071     3522  1.154391e+06\n",
      "tested         0.001528  2.437204e-05    17300   261573  1.084629e+06\n",
      "msf            0.000566  3.359608e-06     6405    36057  1.078526e+06\n",
      "health         0.006850  4.932030e-04    77551  5293302  1.077036e+06\n",
      "gettozero      0.000112  1.451665e-07     1273     1558  9.859874e+05\n",
      "pham           0.000251  7.250873e-07     2840     7782  9.824885e+05\n",
      "hospital       0.003784  1.703145e-04    42847  1827901  9.520729e+05\n",
      "brantly        0.000121  1.748893e-07     1366     1877  9.423650e+05\n",
      "endebola       0.000193  4.532960e-07     2181     4865  9.268522e+05\n",
      "piot           0.000121  1.893314e-07     1372     2032  8.781456e+05\n",
      "westafrica     0.000296  1.185465e-06     3351    12723  8.366452e+05\n",
      "diagnosed      0.000972  1.320345e-05    11001   141706  8.095769e+05\n",
      "worker         0.001928  5.776081e-05    21823   619918  7.282436e+05\n",
      "newsad         0.000268  1.132541e-06     3039    12155  7.202586e+05\n",
      "ebolanews      0.000082  1.074307e-07      927     1153  7.065000e+05\n",
      "disease        0.001918  6.331068e-05    21713   679482  6.577241e+05\n",
      "nhsvolunteers  0.000092  1.501980e-07     1042     1612  6.384875e+05\n",
      "-----------\n",
      "======= economy =======\n",
      "            f_economy       f_total  N_economy   N_total         score\n",
      "cuts         0.034193  2.073766e-04    2225670   2225670  3.669748e+08\n",
      "benefits     0.025502  1.546660e-04    1659953   1659953  2.736978e+08\n",
      "budget       0.024065  1.459492e-04    1566400   1566400  2.582725e+08\n",
      "welfare      0.015771  9.564919e-05    1026555   1026555  1.692613e+08\n",
      "vat          0.004172  2.530398e-05     271575    271575  4.477806e+07\n",
      "osborne      0.003248  1.041891e-04     211405   1118210  6.589958e+06\n",
      "tax          0.005480  3.646732e-04     356715   3913856  5.360600e+06\n",
      "tory         0.004722  3.582916e-04     307380   3845365  4.051251e+06\n",
      "disabled     0.002290  1.044632e-04     149091   1121151  3.268995e+06\n",
      "tories       0.004024  3.422663e-04     261912   3673373  3.079082e+06\n",
      "spending     0.002346  1.293465e-04     152687   1388211  2.769008e+06\n",
      "austerity    0.002142  1.229664e-04     139436   1319737  2.429058e+06\n",
      "cut          0.003106  2.626053e-04     202202   2818412  2.391894e+06\n",
      "reform       0.001444  7.495848e-05      93985    804492  1.810385e+06\n",
      "benefit      0.001806  1.242713e-04     117561   1333742  1.708560e+06\n",
      "ids          0.000904  3.118444e-05      58846    334687  1.705967e+06\n",
      "nhs          0.003111  3.874176e-04     202524   4157960  1.626478e+06\n",
      "ifs          0.000611  1.509564e-05      39783    162014  1.610713e+06\n",
      "labour       0.004488  8.782921e-04     292143   9426271  1.492886e+06\n",
      "disability   0.001074  5.113682e-05      69917    548826  1.468610e+06\n",
      "budget2015   0.000802  3.203186e-05      52232    343782  1.308476e+06\n",
      "health       0.003047  4.932030e-04     198303   5293302  1.224919e+06\n",
      "cameron      0.002727  4.251580e-04     177488   4563009  1.138314e+06\n",
      "reforms      0.000690  2.779622e-05      44908    298323  1.114644e+06\n",
      "government   0.002527  3.879648e-04     164500   4163833  1.071554e+06\n",
      "council      0.001918  2.288134e-04     124867   2455740  1.046860e+06\n",
      "deficit      0.000819  4.264169e-05      53302    457652  1.023593e+06\n",
      "dwp          0.000648  2.732709e-05      42178    293288  1.000122e+06\n",
      "iain         0.000611  2.630114e-05      39744    282277  9.226632e+05\n",
      "claimants    0.000374  1.017601e-05      24375    109214  8.969876e+05\n",
      "duncan       0.000717  3.731069e-05      46658    400437  8.963829e+05\n",
      "poorest      0.000521  2.010230e-05      33895    215748  8.780111e+05\n",
      "billion      0.001227  1.200846e-04      79865   1288808  8.160190e+05\n",
      "poverty      0.001139  1.075682e-04      74144   1154476  7.851319e+05\n",
      "funding      0.001280  1.410774e-04      83298   1514113  7.555903e+05\n",
      "surplus      0.000323  9.005077e-06      21001     96647  7.524317e+05\n",
      "aid          0.001132  1.175478e-04      73664   1261582  7.092031e+05\n",
      "says         0.003662  1.241277e-03     238353  13322002  7.031489e+05\n",
      "sanctions    0.000741  5.100321e-05      48263    547392  7.016261e+05\n",
      "chancellor   0.000559  3.011153e-05      36410    323172  6.763682e+05\n",
      "amp          0.009116  8.165663e-03     593355  87637987  6.623878e+05\n",
      "animal       0.001099  1.216620e-04      71504   1305738  6.456247e+05\n",
      "services     0.001624  2.749992e-04     105717   2951429  6.243568e+05\n",
      "people       0.005100  2.783569e-03     331979  29874657  6.082664e+05\n",
      "pay          0.001854  3.779448e-04     120666   4056293  5.918560e+05\n",
      "govt         0.001159  1.498904e-04      75461   1608699  5.836411e+05\n",
      "public       0.001885  4.081900e-04     122697   4380900  5.666046e+05\n",
      "12billion    0.000074  6.402049e-07       4847      6871  5.637696e+05\n",
      "george       0.001418  2.337837e-04      92297   2509084  5.598034e+05\n",
      "cap          0.000669  5.570659e-05      43517    597871  5.222590e+05\n",
      "-----------\n"
     ]
    }
   ],
   "source": [
    "# For every subset key in tf_ranked, print a banner with the subset name,\n",
    "# the first 50 rows of that subset's table, and a dashed separator line.\n",
    "for subset_name in tf_ranked:\n",
    "    banner = ''.join(['======= ', subset_name, ' ======='])\n",
    "    print(banner)\n",
    "    first_rows = tf_ranked[subset_name].head(50)\n",
    "    print(first_rows)\n",
    "    print('-----------')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Number of leading words exported per topic. The same value is embedded in the\n",
    "# output filename, so the file name can never disagree with its contents.\n",
    "N_KEYWORDS = 25\n",
    "\n",
    "# Build the payload BEFORE opening the file: opening with 'wt' truncates any\n",
    "# existing file, so a failure while slicing tf_ranked must not leave it empty.\n",
    "# Result maps each topic key of tf_ranked to the first N_KEYWORDS index labels\n",
    "# of that topic's table (the index holds the words, as seen in the printed tables).\n",
    "top_keywords = {topic: list(tf_ranked[topic].index[:N_KEYWORDS]) for topic in tf_ranked}\n",
    "\n",
    "keywords_path = os.path.join(DATA_FOLDER, \"topics-%d-extension-keywords.json\" % N_KEYWORDS)\n",
    "with open(keywords_path, 'wt') as outfile:\n",
    "    json.dump(top_keywords, outfile, indent=4)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
